In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
In [39]:
# Load the training split of the Adult census dataset.
train = pd.read_csv("./data/adult.data.csv")
In [40]:
# Load the held-out test split of the Adult census dataset.
test = pd.read_csv("./data/adult.test.csv")
In [41]:
# Preview the first rows of the training data (rich display via last expression).
train.head()
Out[41]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [42]:
# Preview the test data. NOTE(review): unlike train, this file appears to
# have no 'fnlwgt' column (compare the two header rows).
test.head()
Out[42]:
age workclass education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 25 Private 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 ? Some-college 10 Never-married ? Own-child White Female 0 0 30 United-States <=50K
In [43]:
# Mark missing values: this CSV encodes unknowns as the string ' ?'
# (note the leading space). Reassignment instead of inplace=True keeps
# the step explicit and chain-friendly.
train = train.replace(' ?', np.nan)
In [44]:
# Drop every row that still contains at least one missing value.
train=train.dropna()
In [45]:
# Display the cleaned training frame (30,162 rows remain after dropna).
train
Out[45]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K
32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K
32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K
32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K

30162 rows × 15 columns

In [46]:
# Apply the same cleaning to the test split: convert the ' ?' placeholder
# to NaN, then drop incomplete rows. The chained form (no inplace=True)
# mirrors the train cleaning and is idempotent on re-run.
test = test.replace(' ?', np.nan).dropna()
test
Out[46]:
age workclass education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 25 Private 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
5 34 Private 10th 6 Never-married Other-service Not-in-family White Male 0 0 30 United-States <=50K
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16275 33 Private Bachelors 13 Never-married Prof-specialty Own-child White Male 0 0 40 United-States <=50K
16276 39 Private Bachelors 13 Divorced Prof-specialty Not-in-family White Female 0 0 36 United-States <=50K
16278 38 Private Bachelors 13 Married-civ-spouse Prof-specialty Husband White Male 0 0 50 United-States <=50K
16279 44 Private Bachelors 13 Divorced Adm-clerical Own-child Asian-Pac-Islander Male 5455 0 40 United-States <=50K
16280 35 Self-emp-inc Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 60 United-States >50K

15060 rows × 14 columns

In [47]:
# Remove the 'fnlwgt' column from train; the test split does not contain
# it, so dropping it aligns the two frames' columns before concatenation.
train = train.drop(columns=["fnlwgt"])
In [48]:
# Confirm that 'fnlwgt' is gone.
train.head()
Out[48]:
age workclass education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [49]:
# Stack train and test into one frame so one-hot encoding sees every
# category level present in either split. Original row indices are kept,
# so index labels are NOT unique in the combined frame (see data.info()).
data = pd.concat([train,test])
In [50]:
# Report the size of each split.
# Fixed copy-paste bug: the second line previously printed
# "Number of training data" for the test count.
print('Number of training data: ', len(train))
print('Number of test data: ', len(test))
Number of training data:  30162
Number of training data:  15060
In [51]:
# Column dtypes and non-null counts for the combined frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 45222 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             45222 non-null  int64 
 1   workclass       45222 non-null  object
 2   education       45222 non-null  object
 3   education-num   45222 non-null  int64 
 4   marital-status  45222 non-null  object
 5   occupation      45222 non-null  object
 6   relationship    45222 non-null  object
 7   race            45222 non-null  object
 8   sex             45222 non-null  object
 9   capital-gain    45222 non-null  int64 
 10  capital-loss    45222 non-null  int64 
 11  hours-per-week  45222 non-null  int64 
 12  native-country  45222 non-null  object
 13  income          45222 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB
In [26]:
pip install seaborn --upgrade
Requirement already satisfied: seaborn in d:\anaconda3\envs\thktdl\lib\site-packages (0.12.2)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (1.26.0)
Requirement already satisfied: pandas>=1.2 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (2.1.1)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (3.8.0)
Requirement already satisfied: contourpy>=1.0.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)
Requirement already satisfied: cycler>=0.10 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.4)
Requirement already satisfied: packaging>=20.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from pandas>=1.2->seaborn) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from pandas>=1.2->seaborn) (2023.3)
Requirement already satisfied: six>=1.5 in d:\anaconda3\envs\thktdl\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/294.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/294.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/294.9 kB 217.9 kB/s eta 0:00:02
   ----- --------------------------------- 41.0/294.9 kB 217.9 kB/s eta 0:00:02
   -------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
   -------------------------------- ----- 256.0/294.9 kB 749.3 kB/s eta 0:00:01
   -------------------------------------- 294.9/294.9 kB 699.4 kB/s eta 0:00:00
Installing collected packages: seaborn
  Attempting uninstall: seaborn
    Found existing installation: seaborn 0.12.2
    Uninstalling seaborn-0.12.2:
      Successfully uninstalled seaborn-0.12.2
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.
In [16]:
# Pearson correlations between the numeric (int64) columns, rendered as
# an annotated heatmap.
numeric_data = data.select_dtypes(include=['int64'])
plt.figure(figsize=(16, 9))
sns.heatmap(numeric_data.corr(method='pearson'), annot=True)
Out[16]:
<Axes: >
No description has been provided for this image
In [17]:
# Separate the target column from the feature matrix.
label = data['income']
feature = data.drop(columns='income')
In [18]:
# List the categorical (non-int64) columns that will need encoding.
feature.select_dtypes(exclude=['int64']).columns
Out[18]:
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
In [19]:
# One-hot encode every categorical column; numeric columns pass through
# untouched. The raw category values carry a leading space, which is why
# the dummy columns are named e.g. 'workclass_ Private'.
categorical_columns = feature.select_dtypes(exclude=['int64']).columns
feature_onehot = pd.get_dummies(feature, columns=categorical_columns)
feature_onehot
Out[19]:
age education-num capital-gain capital-loss hours-per-week workclass_ Federal-gov workclass_ Local-gov workclass_ Private workclass_ Self-emp-inc workclass_ Self-emp-not-inc ... native-country_ Portugal native-country_ Puerto-Rico native-country_ Scotland native-country_ South native-country_ Taiwan native-country_ Thailand native-country_ Trinadad&Tobago native-country_ United-States native-country_ Vietnam native-country_ Yugoslavia
0 39 13 2174 0 40 False False False False False ... False False False False False False False True False False
1 50 13 0 0 13 False False False False True ... False False False False False False False True False False
2 38 9 0 0 40 False False True False False ... False False False False False False False True False False
3 53 7 0 0 40 False False True False False ... False False False False False False False True False False
4 28 13 0 0 40 False False True False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16275 33 13 0 0 40 False False True False False ... False False False False False False False True False False
16276 39 13 0 0 36 False False True False False ... False False False False False False False True False False
16278 38 13 0 0 50 False False True False False ... False False False False False False False True False False
16279 44 13 5455 0 40 False False True False False ... False False False False False False False True False False
16280 35 13 0 0 60 False False False True False ... False False False False False False False True False False

45222 rows × 103 columns

In [20]:
# Re-split the encoded features back into the original train/test parts.
# The boundary is derived from len(train) instead of the hard-coded magic
# number 30162, so the split stays correct if the cleaning steps above
# ever change the training row count.
n_train = len(train)
x_train = feature_onehot[:n_train]
x_test = feature_onehot[n_train:]
y_train = label[:n_train]
y_test = label[n_train:]
In [21]:
# Fit a decision tree using the entropy (information gain) split
# criterion; fixed seed makes tie-breaking reproducible.
clf = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
clf.fit(x_train, y_train)
Out[21]:
DecisionTreeClassifier(criterion='entropy', random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', random_state=0)
In [22]:
# Evaluate the entropy tree on the held-out split.
tree_pred = clf.predict(x_test)

tree_score = metrics.accuracy_score(y_test, tree_pred)
print("Accuracy:", tree_score)  # fixed typo: was "Accruracy"
print("Report:", metrics.classification_report(y_test, tree_pred))
Accruracy: 0.8175298804780876
Report:               precision    recall  f1-score   support

       <=50K       0.88      0.88      0.88     11360
        >50K       0.63      0.62      0.62      3700

    accuracy                           0.82     15060
   macro avg       0.75      0.75      0.75     15060
weighted avg       0.82      0.82      0.82     15060

In [23]:
# Confusion matrix; per sklearn convention rows are true labels and
# columns are predicted labels.
tree_cm = metrics.confusion_matrix(y_test,tree_pred)
In [24]:
# Plot the entropy tree's confusion matrix.
plt.figure(figsize=(12,12))
sns.heatmap(tree_cm, annot=True, fmt=".3f", linewidth=.5, square=True, cmap='Blues_r');
# sklearn's confusion_matrix puts true labels on rows and predictions on
# columns, so the y-axis is "Actual" and the x-axis is "Predicted".
# (The original cell had these two labels swapped; the Naive Bayes cell
# below already labels them correctly.)
plt.ylabel('Actual Label');
plt.xlabel('Predicted Label');
title = 'Decision Tree Accuracy Score:{0}'.format(tree_score)
plt.title(title, size=15);
No description has been provided for this image
In [25]:
# Render the fitted entropy tree on a large canvas and save the figure.
fig, ax = plt.subplots(figsize=(50,24))
tree.plot_tree(clf,filled=True,fontsize=10)
plt.savefig('decision_tree',dpi=100)
plt.show()
No description has been provided for this image
In [52]:
# Question 8: refit the tree with criterion='gini' instead of entropy.
# NOTE: this rebinds `clf`, replacing the entropy model fitted above.
clf = tree.DecisionTreeClassifier(criterion="gini",random_state=0)
clf.fit(x_train,y_train)
Out[52]:
DecisionTreeClassifier(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(random_state=0)
In [53]:
# Evaluate the gini tree on the held-out split.
tree_pred = clf.predict(x_test)
tree_score = metrics.accuracy_score(y_test, tree_pred)
print("Accuracy:", tree_score)  # fixed typo: was "Accruracy"
print("Report:", metrics.classification_report(y_test, tree_pred))
Accruracy: 0.8122841965471448
Report:               precision    recall  f1-score   support

       <=50K       0.87      0.88      0.88     11360
        >50K       0.62      0.60      0.61      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.74      0.74     15060
weighted avg       0.81      0.81      0.81     15060

In [54]:
# Confusion matrix for the gini tree; per sklearn convention rows are
# true labels and columns are predicted labels.
tree_cm = metrics.confusion_matrix(y_test,tree_pred)
In [55]:
# Plot the gini tree's confusion matrix.
plt.figure(figsize=(12,12))
sns.heatmap(tree_cm, annot=True, fmt=".3f", linewidth=.5, square=True, cmap='Pastel1');
# sklearn's confusion_matrix puts true labels on rows and predictions on
# columns, so the y-axis is "Actual" and the x-axis is "Predicted".
# (The original cell had these two labels swapped.)
plt.ylabel('Actual Label');
plt.xlabel('Predicted Label');
title = 'Decision Tree Accuracy Score:{0}'.format(tree_score)
plt.title(title, size=15);
No description has been provided for this image
In [56]:
# Render the fitted gini tree. Saved under a distinct filename so it no
# longer silently overwrites the entropy tree's figure, which an earlier
# cell saved as 'decision_tree'.
fig, ax = plt.subplots(figsize=(50,24))
tree.plot_tree(clf, filled=True, fontsize=10)
plt.savefig('decision_tree_gini', dpi=100)
plt.show()
No description has been provided for this image
In [57]:
# Gaussian Naive Bayes baseline for comparison with the decision trees.
gnb = GaussianNB()
In [58]:
# Train Gaussian Naive Bayes on the same split and evaluate it.
gnb.fit(x_train, y_train)
bayes_pred = gnb.predict(x_test)
bayes_score = metrics.accuracy_score(y_test, bayes_pred)
print("Accuracy: ", bayes_score)
print("Report: ", metrics.classification_report(y_test, bayes_pred))
Accuracy:  0.8029216467463479
Report:                precision    recall  f1-score   support

       <=50K       0.93      0.80      0.86     11360
        >50K       0.57      0.82      0.67      3700

    accuracy                           0.80     15060
   macro avg       0.75      0.81      0.76     15060
weighted avg       0.84      0.80      0.81     15060

In [59]:
# Confusion matrix and heatmap for the Naive Bayes model. The axis labels
# here already follow sklearn's convention (rows = actual, cols = predicted).
bayes_cm = metrics.confusion_matrix(y_test, bayes_pred)
plt.figure(figsize=(12,12))
sns.heatmap(bayes_cm, annot=True, fmt=".3f", linewidth=.5, square=True, cmap='Greens');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# Fixed typo in the displayed title: "Native Bayes" -> "Naive Bayes".
title = 'Naive Bayes Accuracy Score: {0}'.format(bayes_score)
plt.title(title, size=15);
No description has been provided for this image

Độ chính xác:¶

  • Criterion="gini": 0.8122841965471448
  • Criterion="entropy": 0.8175298804780876
  • Naive Bayes: 0.8029216467463479

Nhận xét:¶

Cả hai mô hình đều đạt độ chính xác cao trên 80%, tuy nhiên mô hình sử dụng criterion="entropy" có độ chính xác cao hơn một chút (khoảng 0.5%). Sự chênh lệch này khá nhỏ và có thể không đáng kể về mặt thực tế.

Kết luận:¶

Cả hai mô hình DecisionTreeClassifier sử dụng criterion="gini" và criterion="entropy" đều đạt hiệu quả cao trong việc phân loại dữ liệu thu nhập.

In [ ]: